Libraries
# Loading libraries
library(tidyverse)
library(readxl)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(reshape2)
library(viridisLite)
library(unikn)
library(readr)
library(caret)
library(class)
library(zoo)
library(fastDummies)
library(randomForest)
library(e1071)
library(xgboost)
library(ipred)
library(rpart)
library(plotly)
Uploading Data
# Read the raw kidney data set and give every column a descriptive name.
kidney_column_names <- c(
  "ID", "age", "blood_pressure", "specific_gravity", "albumin", "sugar",
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "blood_glucose_random", "blood_urea", "serum_creatinine", "sodium",
  "potassium", "haemoglobin", "packed_cell_volume",
  "white_blood_cell_count", "red_blood_cell_count", "hypertension",
  "diabetes_mellitus", "coronary_artery_disease", "appetite", "peda_edema",
  "anemia", "class"
)
data1 <- read_csv("kidney.csv")
colnames(data1) <- kidney_column_names
# Work on a plain data.frame copy of the tibble returned by read_csv().
data <- as.data.frame(data1)
# Variant of the table that keeps only rows without any missing value.
data_clean <- na.omit(data)
Convert Character Variables to Numeric
# Convert measurement columns that are stored as character to numeric.
# red_blood_cell_count is converted as well (the modelling section of this
# script already does so); otherwise it stays character and is silently left
# out of every numeric summary, density plot and correlation below.
numeric_like_cols <- c(
  "white_blood_cell_count", "packed_cell_volume", "red_blood_cell_count"
)
data[numeric_like_cols] <- lapply(data[numeric_like_cols], as.numeric)
# ID is an identifier, not a measurement: store it as a factor so that it is
# not picked up by the is.numeric() column selections.
data$ID <- as.factor(data$ID)
# Recode the target for the exploratory plots: "ckd" -> 0, anything else -> 1.
data$class <- ifelse(data$class == "ckd", 0, 1)
head(data)
## ID age blood_pressure specific_gravity albumin sugar red_blood_cells pus_cell
## 1 0 48 80 1.020 1 0 <NA> normal
## 2 1 7 50 1.020 4 0 <NA> normal
## 3 2 62 80 1.010 2 3 normal normal
## 4 3 48 70 1.005 4 0 normal abnormal
## 5 4 51 80 1.010 2 0 normal normal
## 6 5 60 90 1.015 3 0 <NA> <NA>
## pus_cell_clumps bacteria blood_glucose_random blood_urea serum_creatinine
## 1 notpresent notpresent 121 36 1.2
## 2 notpresent notpresent NA 18 0.8
## 3 notpresent notpresent 423 53 1.8
## 4 present notpresent 117 56 3.8
## 5 notpresent notpresent 106 26 1.4
## 6 notpresent notpresent 74 25 1.1
## sodium potassium haemoglobin packed_cell_volume white_blood_cell_count
## 1 NA NA 15.4 44 7800
## 2 NA NA 11.3 38 6000
## 3 NA NA 9.6 31 7500
## 4 111 2.5 11.2 32 6700
## 5 NA NA 11.6 35 7300
## 6 142 3.2 12.2 39 7800
## red_blood_cell_count hypertension diabetes_mellitus coronary_artery_disease
## 1 5.2 yes yes no
## 2 <NA> no no no
## 3 <NA> no yes no
## 4 3.9 yes no no
## 5 4.6 no no no
## 6 4.4 yes yes no
## appetite peda_edema anemia class
## 1 good no no 0
## 2 good no no 0
## 3 poor no yes 0
## 4 poor yes yes 0
## 5 good no no 0
## 6 good yes no 0
Extracting Categorical and Numerical Columns
# Split the column names by storage type (character vs. numeric).
is_character_col <- vapply(data, is.character, logical(1))
is_numeric_col <- vapply(data, is.numeric, logical(1))
cat_cols <- colnames(data)[is_character_col]
num_cols <- colnames(data)[is_numeric_col]
## Replace incorrect values: anything other than 'yes'/'no' becomes NA
valid_diabetes_values <- c('yes', 'no')
data <- data %>%
  mutate(
    diabetes_mellitus = ifelse(
      diabetes_mellitus %in% valid_diabetes_values,
      diabetes_mellitus,
      NA
    )
  )
# Make sure the target is stored as numeric.
data$class <- as.numeric(data$class)
Checking numerical features distribution
# Density of every numeric column, one facet per feature.
# pivot_longer(all_of(num_cols)) replaces the superseded gather(..., num_cols):
# all_of() makes it explicit that num_cols is an external character vector of
# column names rather than a column of `data`.
numeric_long <- data %>%
  pivot_longer(
    cols = all_of(num_cols),
    names_to = "Feature",
    values_to = "Value"
  )
ggplot(data = numeric_long, aes(x = Value, fill = Feature)) +
  geom_density(alpha = 0.5) +
  # Free scales: the features live on very different value ranges.
  facet_wrap(~Feature, scales = "free") +
  theme_minimal() +
  scale_fill_manual(values = cm.colors(length(num_cols))) +
  labs(title = "Numeric variables distribution", x = "Value", y = "Density")
## Looking at categorical columns
# Define the color palette
my_colors <- hcl.colors(3, palette = "Cold")
# Define the categorical columns to plot
cat_cols_to_plot <- c(
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "hypertension", "diabetes_mellitus", "coronary_artery_disease",
  "appetite", "peda_edema", "anemia"
)

# Build the bar chart of category counts for one column of `data`.
# `column` is the column name as a string. A function (instead of a for loop)
# gives every plot its own environment, so the lazily evaluated
# `.data[[column]]` mapping still refers to the right column when the plot is
# finally drawn.
plot_category_counts <- function(column) {
  # Keep only rows where the column is neither NA nor an empty string.
  filtered_data <- data %>%
    filter(!is.na(.data[[column]]) & .data[[column]] != "") %>%
    select(all_of(c("class", column)))
  # aes(.data[[column]]) replaces the deprecated aes_string().
  ggplot(filtered_data, aes(x = .data[[column]], fill = .data[[column]])) +
    geom_bar() +
    theme_minimal() +
    labs(title = column, x = column) +
    theme(legend.position = "none") +
    scale_fill_manual(values = my_colors) # Color palette
}

# Generate the plots and store them in a list named after the columns
plot_list <- lapply(cat_cols_to_plot, plot_category_counts)
names(plot_list) <- cat_cols_to_plot
# Combine the plots into a single grid. grid.arrange() draws the grid itself;
# print() on its return value only emits the TableGrob layout summary.
combined_plot <- grid.arrange(grobs = plot_list, ncol = 4)
print(combined_plot)
## TableGrob (3 x 4) "arrange": 10 grobs
## z cells name grob
## red_blood_cells 1 (1-1,1-1) arrange gtable[layout]
## pus_cell 2 (1-1,2-2) arrange gtable[layout]
## pus_cell_clumps 3 (1-1,3-3) arrange gtable[layout]
## bacteria 4 (1-1,4-4) arrange gtable[layout]
## hypertension 5 (2-2,1-1) arrange gtable[layout]
## diabetes_mellitus 6 (2-2,2-2) arrange gtable[layout]
## coronary_artery_disease 7 (2-2,3-3) arrange gtable[layout]
## appetite 8 (2-2,4-4) arrange gtable[layout]
## peda_edema 9 (3-3,1-1) arrange gtable[layout]
## anemia 10 (3-3,2-2) arrange gtable[layout]
## Heatmap of data
# Keep only the numeric columns.
numeric_data <- data[vapply(data, is.numeric, logical(1))]
## Correlation matrix, computed on rows without any missing value
heatmap_data <- cor(numeric_data, use = "complete.obs")
# Long format: one row per (Var1, Var2) pair with its correlation in `value`.
heatmap_long <- melt(heatmap_data)
ggplot(data = heatmap_long, aes(Var2, Var1, fill = value)) +
  geom_tile() +
  theme_minimal() +
  scale_fill_viridis_c(option = "viridis") + # Palette
  # Print the rounded coefficient inside each tile.
  geom_text(aes(label = round(value, 2)), vjust = 1) +
  labs(title = "Correlation Heatmap") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Vertical titles
DENSITY PLOT AND VIOLIN PLOT FOR AGE
# Density plot for the "age" variable differentiating between CKD and NotCKD.
# class is coded 0 = CKD, 1 = NotCKD earlier in this script; colours and
# labels are keyed by factor level so they cannot be swapped by accident.
ggplot(data, aes(x = age, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Age by CKD Status", x = "Age", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot for the "age" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = age, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Named labels are matched against the axis breaks, which are the factor
  # levels "0" and "1"; names such as "CKD" never match and leave 0/1 on the
  # axis.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Age by CKD Status", x = "CKD Status", y = "Age") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR ALBUMIN
# Density plot for the "albumin" variable differentiating between CKD and
# NotCKD (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data, aes(x = albumin, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Albumin by CKD Status", x = "Albumin", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with boxplot for the "Albumin" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = albumin, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Albumin by CKD Status", x = "CKD Status", y = "Albumin") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR BLOOD PRESSURE
# Blood Pressure
# Density plot for the "blood_pressure" variable differentiating between CKD and NotCKD
# (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data, aes(x = blood_pressure, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Blood Pressure by CKD Status", x = "Blood Pressure", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with boxplot for the "blood_pressure" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = blood_pressure, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Pressure by CKD Status", x = "CKD Status", y = "Blood Pressure") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR BLOOD GLUCOSE RANDOM
# Remove rows with missing values in the "blood_glucose_random" variable
# (done once; both plots below use the same filtered table).
data_cleaned_blood_glucose <- data[!is.na(data$blood_glucose_random), ]
# Density plot for the "blood_glucose_random" variable differentiating between
# CKD and NotCKD (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data_cleaned_blood_glucose, aes(x = blood_glucose_random, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Blood Glucose (Random) by CKD Status", x = "Blood Glucose (Random)", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with box plot for the "blood_glucose_random" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_blood_glucose, aes(x = factor(class), y = blood_glucose_random, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Glucose (Random) by CKD Status", x = "CKD Status", y = "Blood Glucose (Random)") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR BLOOD UREA
# Remove rows with missing values in the "blood_urea" variable
# (done once; both plots below use the same filtered table).
data_cleaned_blood_urea <- data[!is.na(data$blood_urea), ]
# Density plot for the "blood_urea" variable differentiating between CKD and NotCKD
# (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data_cleaned_blood_urea, aes(x = blood_urea, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Blood Urea by CKD Status", x = "Blood Urea", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with inner box plot for the "blood_urea" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_blood_urea, aes(x = factor(class), y = blood_urea, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Urea by CKD Status", x = "CKD Status", y = "Blood Urea") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR HEMOGLOBIN
# Remove rows with missing values in the "haemoglobin" variable
data_cleaned_haemoglobin <- data[!is.na(data$haemoglobin), ]
# Density plot for the "haemoglobin" variable differentiating between CKD and NotCKD
# (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data_cleaned_haemoglobin, aes(x = haemoglobin, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Hemoglobin by CKD Status", x = "Haemoglobin", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with inner box plot for the "haemoglobin" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_haemoglobin, aes(x = factor(class), y = haemoglobin, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Hemoglobin by CKD Status", x = "CKD Status", y = "Haemoglobin") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR SERUM CREATININE
# Remove rows with missing values in the "serum_creatinine" variable
data_cleaned_serum_creatinine <- data[!is.na(data$serum_creatinine), ]
# Density plot for the "serum_creatinine" variable differentiating between CKD and NotCKD
# (class is coded 0 = CKD, 1 = NotCKD earlier in this script).
ggplot(data_cleaned_serum_creatinine, aes(x = serum_creatinine, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Title and axis label previously said "Hemoglobin" / "serum_creatinine"
  # (copy-paste from the haemoglobin plot).
  labs(title = "Distribution of Serum Creatinine by CKD Status", x = "Serum Creatinine", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with inner box plot for the "serum_creatinine" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_serum_creatinine, aes(x = factor(class), y = serum_creatinine, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Serum Creatinine by CKD Status", x = "CKD Status", y = "Serum Creatinine") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR SPECIFIC GRAVITY
# Remove rows with missing values in the "specific_gravity" variable
data_cleaned_specific_gravity <- data[!is.na(data$specific_gravity), ]
# Density plot for the "specific_gravity" variable differentiating between CKD and NotCKD.
# class is coded 0 = CKD, 1 = NotCKD earlier in this script. The previous
# unnamed values/labels started with "NotCKD", which attached that label to
# level 0 (CKD) and inverted the legend; keying by level fixes this and gives
# the same colours as the other plots.
ggplot(data_cleaned_specific_gravity, aes(x = specific_gravity, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of Specific Gravity by CKD Status", x = "Specific Gravity", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with inner box plot for the "specific_gravity" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_specific_gravity, aes(x = factor(class), y = specific_gravity, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Specific Gravity by CKD Status", x = "CKD Status", y = "Specific Gravity") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
DENSITY PLOT AND VIOLIN PLOT FOR WHITE BLOOD CELLS
# Remove rows with missing values in the "white_blood_cell_count" variable
data_cleaned_white_blood_cell_count <- data[!is.na(data$white_blood_cell_count), ]
# Density plot for the "white_blood_cell_count" variable differentiating between CKD and NotCKD.
# class is coded 0 = CKD, 1 = NotCKD earlier in this script. The previous
# unnamed values/labels started with "NotCKD", which attached that label to
# level 0 (CKD) and inverted the legend; keying by level fixes this and gives
# the same colours as the other plots.
ggplot(data_cleaned_white_blood_cell_count, aes(x = white_blood_cell_count, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "blue"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  labs(title = "Distribution of White Blood Cell Count by CKD Status", x = "White Blood Cell Count", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Violin plot with inner box plot for the "white_blood_cell_count" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_white_blood_cell_count, aes(x = factor(class), y = white_blood_cell_count, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(
    values = c("0" = "#FF6F61", "1" = "#6F8A91"),
    labels = c("0" = "CKD", "1" = "NotCKD")
  ) +
  # Label names must be the factor levels "0"/"1" to be matched to the breaks.
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of White Blood Cell Count by CKD Status", x = "CKD Status", y = "White Blood Cell Count") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())
# Reload the raw file for the modelling stage and rename its columns
# (the oedema column is spelled 'pedal_edema' in this section).
model_column_names <- c(
  "ID", "age", "blood_pressure", "specific_gravity", "albumin", "sugar",
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "blood_glucose_random", "blood_urea", "serum_creatinine", "sodium",
  "potassium", "haemoglobin", "packed_cell_volume",
  "white_blood_cell_count", "red_blood_cell_count", "hypertension",
  "diabetes_mellitus", "coronary_artery_disease", "appetite", "pedal_edema",
  "anemia", "class"
)
data1 <- read_csv("kidney.csv")
colnames(data1) <- model_column_names
data <- as.data.frame(data1)
# Coerce the three count/volume columns to numeric, in the original order.
count_cols <- c(
  "white_blood_cell_count", "packed_cell_volume", "red_blood_cell_count"
)
data[count_cols] <- lapply(data[count_cols], as.numeric)
# ID becomes a factor, which keeps it out of both selections below.
data$ID <- as.factor(data$ID)
data_numeric <- select_if(data, is.numeric) # Select numeric variables
data_categorical <- select_if(data, is.character) # Select categorical variables
# Calculate the mean of each column
column_means <- colMeans(data_numeric, na.rm = TRUE)
# Find the positions of NA cells
nas <- is.na(data_numeric)
# Impute NA values with the mean of their own column. The previous indexing,
# column_means[rep(1, sum(nas))], filled every NA in every column with the
# mean of the first column only.
# A column that is entirely NA has mean NaN and is filled with NaN.
data_numeric[] <- Map(
  function(values, column_mean) {
    values[is.na(values)] <- column_mean
    values
  },
  data_numeric,
  column_means
)
# For every categorical column, the level that is encoded as 0; every other
# non-missing level is encoded as 1 and NA stays NA.
zero_levels <- c(
  red_blood_cells = "normal",
  pus_cell = "normal",
  pus_cell_clumps = "notpresent",
  bacteria = "notpresent",
  hypertension = "no",
  diabetes_mellitus = "no",
  coronary_artery_disease = "no",
  appetite = "good",
  pedal_edema = "no",
  anemia = "no",
  class = "notckd"
)
data_categorical[names(zero_levels)] <- Map(
  function(values, zero_level) if_else(values == zero_level, 0, 1),
  data_categorical[names(zero_levels)],
  zero_levels
)
# Replace NA with mode (0 or 1); ties are resolved in favour of 0
fill_with_mode <- function(values) {
  n_zero <- sum(values == 0, na.rm = TRUE)
  n_one <- sum(values == 1, na.rm = TRUE)
  values[is.na(values)] <- if (n_zero >= n_one) 0 else 1
  values
}
data_categorical[] <- lapply(data_categorical, fill_with_mode)
# List of binary column names
column_names <- c(
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "hypertension", "diabetes_mellitus", "coronary_artery_disease",
  "appetite", "pedal_edema", "anemia", "class"
)
# Convert the 0/1 columns to factors
data_categorical[column_names] <- lapply(
  data_categorical[column_names], as.factor
)
summary(data_categorical)
## red_blood_cells pus_cell pus_cell_clumps bacteria hypertension
## 0:353 0:324 0:358 0:378 0:253
## 1: 47 1: 76 1: 42 1: 22 1:147
## diabetes_mellitus coronary_artery_disease appetite pedal_edema anemia class
## 0:263 0:366 0:318 0:324 0:340 0:150
## 1:137 1: 34 1: 82 1: 76 1: 60 1:250
# Create dummy variables for categorical columns in data_categorical.
# dummy_cols() appends the indicator columns after the original ones.
data_dummy <- dummy_cols(data_categorical, select_columns = column_names)
# Keep only the original binary factor columns. They are selected by name
# rather than by position (1:11), so the result does not depend on how many
# columns data_categorical has or in which order they appear. The appended
# indicator columns are not used further.
data_dummy_subset <- data_dummy[, column_names]
summary(data_dummy_subset)
## red_blood_cells pus_cell pus_cell_clumps bacteria hypertension
## 0:353 0:324 0:358 0:378 0:253
## 1: 47 1: 76 1: 42 1: 22 1:147
## diabetes_mellitus coronary_artery_disease appetite pedal_edema anemia class
## 0:263 0:366 0:318 0:324 0:340 0:150
## 1:137 1: 34 1: 82 1: 76 1: 60 1:250
# Put the imputed numeric features and the binary factors side by side
data_combined <- cbind(data_numeric, data_dummy_subset)
# The label is "class"; every other column is a feature
dep_col <- "class"
ind_col <- setdiff(names(data_combined), dep_col)
X <- data_combined[, ind_col]
y <- data_combined[, dep_col]
# 70/30 split of the rows, seeded for reproducibility
set.seed(0)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)
# Training part
X_train <- X[train_index, ]
y_train <- y[train_index]
# Held-out part
X_test <- X[-train_index, ]
y_test <- y[-train_index]
# Train the KNN model.
# KNN is distance based, so the features are first turned into a purely
# numeric matrix and standardised. Without this, the distance is dominated by
# the columns with the largest raw values. Centre and scale are estimated on
# the training rows only and then applied to the test rows.
# The accuracy shown in the rendered output was produced without scaling and
# will change on re-run.
knn_train <- data.matrix(X_train) # factors become their integer codes
knn_test <- data.matrix(X_test)
feature_means <- colMeans(knn_train)
feature_sds <- apply(knn_train, 2, sd)
feature_sds[feature_sds == 0] <- 1 # constant column: avoid division by zero
knn_train <- scale(knn_train, center = feature_means, scale = feature_sds)
knn_test <- scale(knn_test, center = feature_means, scale = feature_sds)
# knn() returns the predicted class of every test row (k = 5 neighbours)
knn_model <- knn(knn_train, knn_test, y_train, k = 5)
knn_acc <- mean(knn_model == y_test)
# Print KNN model results
cat("Test Accuracy of KNN is:", knn_acc, "\n")
## Test Accuracy of KNN is: 0.7166667
# Fit a decision tree on the training rows; the response y_train is taken
# from the calling environment and all columns of X_train are predictors.
dtc_model <- rpart(formula = y_train ~ ., data = X_train)
# Predicted class label for every held-out row
dtc_pred <- predict(dtc_model, newdata = X_test, type = "class")
# Share of held-out rows that were classified correctly
dtc_hits <- dtc_pred == y_test
dtc_acc <- mean(dtc_hits)
# Report the decision tree accuracy
cat("Test Accuracy of Decision Tree Classifier is:", dtc_acc, "\n")
## Test Accuracy of Decision Tree Classifier is: 0.925
# Fit a random forest classifier on the training rows
rf_labels <- as.factor(y_train)
rf_model <- randomForest(x = X_train, y = rf_labels)
# Predicted class label for every held-out row
rf_pred <- predict(rf_model, newdata = X_test)
# Share of held-out rows that were classified correctly
rf_hits <- rf_pred == y_test
rf_acc <- mean(rf_hits)
# Report the random forest accuracy
cat("Test Accuracy of Random Forest Classifier is:", rf_acc, "\n")
## Test Accuracy of Random Forest Classifier is: 1
# Collect the test accuracy of every model in one table
model_names <- c('KNN', 'Decision Tree Classifier', 'Random Forest Classifier')
model_scores <- c(knn_acc, dtc_acc, rf_acc)
models <- data.frame(Model = model_names, Score = model_scores)
# Best model first (row names keep the original positions)
score_order <- order(-models$Score)
models <- models[score_order, ]
# Show the ranking
print(models)
## Model Score
## 3 Random Forest Classifier 1.0000000
## 2 Decision Tree Classifier 0.9250000
## 1 KNN 0.7166667
# Create a horizontal bar plot of the model scores.
# Title and axis titles are set with plotly::layout(): plot_ly() has no
# `layout` argument, so the list that was passed there was treated as a trace
# attribute and never applied to the figure.
# NOTE(review): template = "plotly_dark" was part of that list. Named
# templates appear to be a Python-API feature, so it is not carried over here;
# confirm whether a dark theme is wanted and set the colours explicitly if so.
plot_ly(
  data = models,
  x = ~Score,
  y = ~Model,
  color = ~Score,
  type = "bar",
  orientation = "h",
  colors = viridis(length(unique(models$Score))), # Use viridis color palette
  text = ~paste("Score: ", round(Score, 3))
) %>%
  plotly::layout(
    title = "Models Comparison",
    xaxis = list(title = "Score"),
    yaxis = list(title = "Model")
  )